22 marca 2017
rozkład dwumianowy
?rbinom  # open the help page for the binomial distribution family (rbinom, dbinom, ...)
rozkład dwumianowy
?rnorm  # open the help page for the normal distribution family (rnorm, dnorm, ...)
# Bernoulli draws: binomial with size = 1 at decreasing success
# probability p. Wrapping an assignment in () makes R print the result.
(b1 <- rbinom(20, 1, 1))   # p = 1: every trial succeeds
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
(b2 <- rbinom(20, 1, .9))  # p = 0.9: mostly successes
## [1] 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1
(b3 <- rbinom(20, 1, .5))  # p = 0.5: a fair coin
## [1] 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0
(b4 <- rbinom(20, 1, .1))  # p = 0.1: mostly failures
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Normal samples: rnorm(n) draws from N(0, 1); rnorm(n, mean, sd)
# sets location and spread explicitly.
(n1 <- rnorm(10))          # 10 draws from N(0, 1)
## [1] -1.5184260 0.6689521 0.7037585 -0.8418143 0.7825232 -1.0767775 ## [7] -1.9932618 -0.8253611 -1.3153762 -0.4782327
(n2 <- rnorm(20))          # 20 draws from N(0, 1)
## [1] 0.84539382 1.03132417 -0.12299760 -0.07523712 0.03745902 ## [6] -1.01659658 -0.26086776 0.80222020 -0.89632274 -1.96804403 ## [11] -0.79216019 1.11885556 0.74299087 -1.21813708 -0.67435618 ## [16] 0.71137073 -0.32231317 -0.01406886 -0.93261898 -0.28718003
(n3 <- rnorm(10, 50, 1))   # 10 draws from N(50, 1)
## [1] 51.15351 48.27386 50.78876 48.60620 50.84979 50.70957 51.18439 ## [8] 49.93303 50.59466 49.25930
n4 <- rnorm(1000, 50, 10)  # large sample from N(50, 10), not printed
# Five-number summaries (plus the mean) of the normal samples.
summary(n1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -1.9933 -1.2557 -0.8336 -0.5894 0.3822 0.7825
#summary(n2)
# NOTE: extraction had fused the commented-out summary(n2) and the live
# summary(n3) onto one line, disabling the call whose output follows.
summary(n3)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 48.27 49.43 50.65 50.14 50.83 51.18
summary(n4)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 12.61 43.63 50.14 50.11 56.68 84.93
# Stack the four samples into one long data frame; `set` labels which
# sample each value came from (1..4).
help1 <- rbind(data.frame(val=n1, set=1), data.frame(val=n2, set=2),
data.frame(val=n3, set=3), data.frame(val=n4, set=4))
head(help1)
## val set ## 1 -1.5184260 1 ## 2 0.6689521 1 ## 3 0.7037585 1 ## 4 -0.8418143 1 ## 5 0.7825232 1 ## 6 -1.0767775 1
tail(help1)
## val set ## 1035 54.79787 4 ## 1036 47.72527 4 ## 1037 34.81777 4 ## 1038 47.22181 4 ## 1039 55.81132 4 ## 1040 55.90293 4
# One box per sample: sets 3 and 4 (mean 50) sit far above sets 1 and 2.
boxplot(help1$val~help1$set)
# Shapiro-Wilk test: H0 = the sample comes from a normal distribution.
# For a genuinely normal sample the p-value is typically large.
dat <- rnorm(500)
shapiro.test(dat)
## ## Shapiro-Wilk normality test ## ## data: dat ## W = 0.9956, p-value = 0.1723
# shapiro.test() accepts at most 5000 observations, so for a large
# sample we test a random subsample instead; for uniform data
# normality is clearly rejected.
dat <- runif(100000)
#shapiro.test(dat)
#Error in shapiro.test(dat_log) : sample size must be between 3 and 5000
#shapiro.test(dat_log)
shapiro.test(sample(dat, 500))
## ## Shapiro-Wilk normality test ## ## data: sample(dat, 500) ## W = 0.95813, p-value = 1.018e-10
hipoteza zerowa: rozkład próby X jest taki sam, jak rozkład próby Y
https://www.rdocumentation.org/packages/stats/versions/3.3.2/topics/ks.test
# Kolmogorov-Smirnov test: H0 = both samples come from the same
# distribution.
x <- rnorm(100)
y <- rnorm(100)
z <- runif(100)
tail(x)
## [1] 0.6188239 -0.1442300 1.6741212 1.3985252 -0.8841180 -0.2732202
head(y)
## [1] -0.6225165 0.8996031 0.3825920 -0.1568072 -1.4143547 -0.9643388
z[1:20]
## [1] 0.4585631 0.4013670 0.4042408 0.8762633 0.3068749 0.8540851 0.3343854 ## [8] 0.9899785 0.4388535 0.3241845 0.4759101 0.3399886 0.7986054 0.4128282 ## [15] 0.1432191 0.8008813 0.8791489 0.7206995 0.4407284 0.1555384
# Two normal samples: H0 is not rejected.
ks.test(x, y)
## ## Two-sample Kolmogorov-Smirnov test ## ## data: x and y ## D = 0.11, p-value = 0.5806 ## alternative hypothesis: two-sided
# Normal vs. uniform: H0 is clearly rejected.
ks.test(x, z)
## ## Two-sample Kolmogorov-Smirnov test ## ## data: x and z ## D = 0.44, p-value = 7.818e-09 ## alternative hypothesis: two-sided
założenie: próba pochodzi z rozkładu normalnego
t.test(x, mu=liczba)
https://www.datacamp.com/courses/intro-to-statistics-with-r-student-s-t-test
# One-sample t-test: H0 = the population mean equals `mu`
# (assumes the sample comes from a normal distribution).
# Standard normal sample vs mu = 0: H0 is not rejected.
x <- rnorm(1000)
t.test(x, mu=0)
## ## One Sample t-test ## ## data: x ## t = 0.086889, df = 999, p-value = 0.9308 ## alternative hypothesis: true mean is not equal to 0 ## 95 percent confidence interval: ## -0.05852052 0.06394301 ## sample estimates: ## mean of x ## 0.002711243
# Sample centred at 10 vs mu = 0: H0 is overwhelmingly rejected.
z <- rnorm(1000, 10, 1)
t.test(z, mu=0)
## ## One Sample t-test ## ## data: z ## t = 319.26, df = 999, p-value < 2.2e-16 ## alternative hypothesis: true mean is not equal to 0 ## 95 percent confidence interval: ## 9.957597 10.080764 ## sample estimates: ## mean of x ## 10.01918
założenia: próby pochodzą z rozkładu normalnego
t.test(x, y)
# Welch two-sample t-test: H0 = both populations share the same mean
# (normality assumed; equal variances and equal sizes not required).
x <- rnorm(1000, 10, 5)
y <- rnorm(800, 10, 10)
z <- rnorm(900, 15, 8)
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -6.864 6.580 9.852 9.843 13.139 26.090
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -16.014 2.598 9.625 9.804 16.934 39.264
summary(z)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -5.30 10.09 15.29 15.26 20.79 36.84
# Same mean (10), different spread: no significant difference.
t.test(x, y)
## ## Welch Two Sample t-test ## ## data: x and y ## t = 0.10319, df = 1119.9, p-value = 0.9178 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.7077602 0.7863370 ## sample estimates: ## mean of x mean of y ## 9.842930 9.803641
# Means 10 vs 15: clearly significant.
t.test(x, z)
## ## Welch Two Sample t-test ## ## data: x and z ## t = -18.016, df = 1505.5, p-value < 2.2e-16 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -6.001827 -4.823196 ## sample estimates: ## mean of x mean of y ## 9.84293 15.25544
próby są równoliczne
t.test(x, y, paired = T)
# Paired t-test: observations are matched pairwise, so the samples
# must have equal length.
x <- rnorm(1000, 10, 5)
y <- rnorm(1000, 10, 1)
z <- rnorm(1000, 15, 5)
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -5.128 6.386 9.948 9.831 13.039 26.545
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 6.751 9.353 10.076 10.025 10.721 13.438
summary(z)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -1.325 11.351 14.973 14.881 18.308 34.701
# Same mean: difference not significant.
t.test(x, y, paired=TRUE)
## ## Paired t-test ## ## data: x and y ## t = -1.1962, df = 999, p-value = 0.2319 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.5121138 0.1242132 ## sample estimates: ## mean of the differences ## -0.1939503
# Means 10 vs 15: clearly significant.
t.test(x, z, paired=TRUE)
## ## Paired t-test ## ## data: x and z ## t = -21.609, df = 999, p-value < 2.2e-16 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -5.508599 -4.591418 ## sample estimates: ## mean of the differences ## -5.050008
uwaga: brak założeń dotyczących rozkładu prób
wilcox.test(x, y, paired=T)
# Wilcoxon tests: non-parametric, no assumptions about the sample
# distribution.
x <- runif(100)
y <- rnorm(100)
# One-sample signed-rank test: uniform(0,1) values are all positive,
# so location 0 is strongly rejected...
wilcox.test(x, mu=0)
## ## Wilcoxon signed rank test with continuity correction ## ## data: x ## V = 5050, p-value < 2.2e-16 ## alternative hypothesis: true location is not equal to 0
# ...while a standard normal sample is consistent with location 0.
wilcox.test(y, mu=0)
## ## Wilcoxon signed rank test with continuity correction ## ## data: y ## V = 2798, p-value = 0.3488 ## alternative hypothesis: true location is not equal to 0
# Two-sample rank-sum (Mann-Whitney) test.
wilcox.test(x, y)
## ## Wilcoxon rank sum test with continuity correction ## ## data: x and y ## W = 6476, p-value = 0.0003119 ## alternative hypothesis: true location shift is not equal to 0
# Paired signed-rank test.
wilcox.test(x, y, paired=TRUE)
## ## Wilcoxon signed rank test with continuity correction ## ## data: x and y ## V = 3522, p-value = 0.0006119 ## alternative hypothesis: true location shift is not equal to 0
uogólnienie testu Wilcoxona
kruskal.test(list(próba1, próba2, próba3, ...))
# Kruskal-Wallis: rank-based generalisation of the Wilcoxon test to
# three or more samples.
w <- runif(200, 1, 2)
x <- runif(200)
y <- runif(200)
z <- rnorm(200, .5, 1)
summary(w)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 1.004 1.258 1.476 1.496 1.755 1.997
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.003137 0.243102 0.496421 0.494187 0.756661 0.998796
#summary(y)
# NOTE: extraction had fused the commented-out summary(y) and the live
# summary(z) onto one line, disabling the call whose output follows.
summary(z)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -1.7373 -0.1073 0.5266 0.5430 1.2266 2.7702
# x, y, z all have location ~0.5: no significant difference detected.
kruskal.test(list(x, y, z))
## ## Kruskal-Wallis rank sum test ## ## data: list(x, y, z) ## Kruskal-Wallis chi-squared = 0.67071, df = 2, p-value = 0.7151
# Adding w (location ~1.5) makes the test highly significant.
kruskal.test(list(w, x, y, z))
## ## Kruskal-Wallis rank sum test ## ## data: list(w, x, y, z) ## Kruskal-Wallis chi-squared = 354.42, df = 3, p-value < 2.2e-16
hipoteza zerowa: H0: var1 = var2
var.test(próba1, próba2)
# F test for equality of two variances (assumes normal samples).
x <- runif(200)
y <- runif(200)
z <- rnorm(200, .5, 1)
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.003799 0.287075 0.527834 0.518653 0.746871 0.982079
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.01894 0.29069 0.50084 0.51224 0.75650 0.99576
summary(z)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -2.5985 -0.2627 0.5079 0.4284 1.1275 2.7071
# Two uniform samples: variance ratio ~1, H0 not rejected.
var.test(x, y)
## ## F test to compare two variances ## ## data: x and y ## F = 0.99257, num df = 199, denom df = 199, p-value = 0.9581 ## alternative hypothesis: true ratio of variances is not equal to 1 ## 95 percent confidence interval: ## 0.7511625 1.3115566 ## sample estimates: ## ratio of variances ## 0.9925684
# Uniform (var = 1/12) vs normal (var = 1): H0 clearly rejected.
var.test(x, z)
## ## F test to compare two variances ## ## data: x and z ## F = 0.081908, num df = 199, denom df = 199, p-value < 2.2e-16 ## alternative hypothesis: true ratio of variances is not equal to 1 ## 95 percent confidence interval: ## 0.0619868 0.1082312 ## sample estimates: ## ratio of variances ## 0.0819079
hipoteza zerowa: H0: var1 = var2
ansari.test(próba1, próba2)
# Ansari-Bradley test: non-parametric comparison of the scales
# (spreads) of two samples.
x <- runif(200)
y <- runif(200)
z <- rnorm(200, .5, 1)
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.006277 0.270478 0.524086 0.513665 0.755969 0.995420
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.004833 0.256814 0.522670 0.497526 0.722511 0.978571
summary(z)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -2.2208 -0.1506 0.4614 0.4888 1.0571 3.1471
# Two uniform samples: same scale, H0 not rejected.
ansari.test(x, y)
## ## Ansari-Bradley test ## ## data: x and y ## AB = 19868, p-value = 0.6882 ## alternative hypothesis: true ratio of scales is not equal to 1
# Uniform vs normal: scales differ, H0 rejected.
ansari.test(x, z)
## ## Ansari-Bradley test ## ## data: x and z ## AB = 26296, p-value < 2.2e-16 ## alternative hypothesis: true ratio of scales is not equal to 1
hipoteza zerowa: H0: var1 = var2 = var3 = ...
bartlett.test(x, g=czynnik dzielący dane)
# Bartlett test: H0 = all groups have equal variance.
# Two uniform groups (var = 1/12) plus a normal one (var = 1): rejected.
x <- runif(200)
y <- runif(200)
z <- rnorm(200, .5, 1)
bartlett.test(list(x, y, z))
## ## Bartlett test of homogeneity of variances ## ## data: list(x, y, z) ## Bartlett's K-squared = 427.66, df = 2, p-value < 2.2e-16
zastosowanie: testowanie możliwości wystąpienia określonej liczby sukcesów w zadanej liczbie prób przy wskazanym prawdopodobieństwie
prop.test(liczba sukcesów, liczba prób, prawdopodobieństwo)
# One-sample proportion test: H0 is that the true success probability
# equals the supplied value.
# 50 successes out of 100 trials vs p = 0.5: fully consistent with H0.
prop.test(50, 100, .5)
## ## 1-sample proportions test without continuity correction ## ## data: 50 out of 100, null probability 0.5 ## X-squared = 0, df = 1, p-value = 1 ## alternative hypothesis: true p is not equal to 0.5 ## 95 percent confidence interval: ## 0.4038315 0.5961685 ## sample estimates: ## p ## 0.5
# The same counts vs p = 0.25: H0 strongly rejected.
prop.test(50, 100, .25)
## ## 1-sample proportions test with continuity correction ## ## data: 50 out of 100, null probability 0.25 ## X-squared = 32.013, df = 1, p-value = 1.531e-08 ## alternative hypothesis: true p is not equal to 0.25 ## 95 percent confidence interval: ## 0.3990211 0.6009789 ## sample estimates: ## p ## 0.5
hipoteza zerowa: poszczególne cechy w populacji występują niezależnie od siebie, z równą częstością
chisq.test(table(zbiór danych))
## From Agresti(2007) p.39
# 2x3 contingency table of gender vs. party identification.
M <- as.table(matrix(c(762, 484, 327, 239, 468, 477), nrow = 2))
dimnames(M) <- list(gender = c("F", "M"),
                    party = c("Democrat","Independent", "Republican"))
M
## party ## gender Democrat Independent Republican ## F 762 327 468 ## M 484 239 477
# Pearson chi-squared test of independence between the two factors.
chisq.test(M)
## ## Pearson's Chi-squared test ## ## data: M ## X-squared = 30.07, df = 2, p-value = 2.954e-07
# Chi-squared test on a 3x2 table of treatment vs. control counts.
# (Statements were fused onto one line by extraction; also use `<-`
# instead of `=` for assignment.)
lek <- c(19, 41, 60)   # treatment group counts
ctl <- c(46, 19, 15)   # control group counts
chisq.test(cbind(lek, ctl))
## ## Pearson's Chi-squared test ## ## data: cbind(lek, ctl) ## X-squared = 39.877, df = 2, p-value = 2.192e-09
## From Agresti (1990, p. 61f; 2002, p. 91)
# Classic tea-tasting experiment: 2x2 table of guessed vs. true
# pouring order.
TeaTasting <- matrix(c(3, 1, 1, 3), nrow = 2)
dimnames(TeaTasting) <- list(Guess = c("Milk", "Tea"),
                             Truth = c("Milk", "Tea"))
TeaTasting
## Truth ## Guess Milk Tea ## Milk 3 1 ## Tea 1 3
# Fisher's exact test of independence (appropriate for small counts).
fisher.test(TeaTasting)
## ## Fisher's Exact Test for Count Data ## ## data: TeaTasting ## p-value = 0.4857 ## alternative hypothesis: true odds ratio is not equal to 1 ## 95 percent confidence interval: ## 0.2117329 621.9337505 ## sample estimates: ## odds ratio ## 6.408309
#install.packages("agricolae")
# agricolae provides agricultural statistics and post-hoc
# multiple-comparison tests (e.g. Duncan's multiple range test).
library(agricolae)
# Example data: sweet potato yield under four virus treatments,
# three replicates each (per the printed table below).
data(sweetpotato)
sweetpotato
## virus yield ## 1 cc 28.5 ## 2 cc 21.7 ## 3 cc 23.0 ## 4 fc 14.9 ## 5 fc 10.6 ## 6 fc 13.1 ## 7 ff 41.8 ## 8 ff 39.2 ## 9 ff 28.0 ## 10 oo 38.2 ## 11 oo 40.4 ## 12 oo 32.1
# One-way ANOVA: does mean yield differ between virus treatments?
# (The assignment and the print were fused onto one line by extraction.)
model <- aov(yield~virus, data=sweetpotato)
model
## Call: ## aov(formula = yield ~ virus, data = sweetpotato) ## ## Terms: ## virus Residuals ## Sum of Squares 1170.2092 179.9133 ## Deg. of Freedom 3 8 ## ## Residual standard error: 4.742274 ## Estimated effects may be unbalanced
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F) ## virus 3 1170.2 390.1 17.34 0.000733 *** ## Residuals 8 179.9 22.5 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Duncan's multiple range test: treatments sharing a letter do not
# differ significantly.
duncan.test(model, "virus", console = TRUE)
## ## Study: model ~ "virus" ## ## Duncan's new multiple range test ## for yield ## ## Mean Square Error: 22.48917 ## ## virus, means ## ## yield std r Min Max ## cc 24.40000 3.609709 3 21.7 28.5 ## fc 12.86667 2.159475 3 10.6 14.9 ## ff 36.33333 7.333030 3 28.0 41.8 ## oo 36.90000 4.300000 3 32.1 40.4 ## ## Alpha: 0.05 ; DF Error: 8 ## ## Critical Range ## 2 3 4 ## 8.928965 9.304825 9.514910 ## ## Means with the same letter are not significantly different. ## ## yield groups ## oo 36.90000 a ## ff 36.33333 a ## cc 24.40000 b ## fc 12.86667 c
# Keep the result object this time so its components can be reused.
(out <- duncan.test(model, "virus", console = TRUE))
## ## Study: model ~ "virus" ## ## Duncan's new multiple range test ## for yield ## ## Mean Square Error: 22.48917 ## ## virus, means ## ## yield std r Min Max ## cc 24.40000 3.609709 3 21.7 28.5 ## fc 12.86667 2.159475 3 10.6 14.9 ## ff 36.33333 7.333030 3 28.0 41.8 ## oo 36.90000 4.300000 3 32.1 40.4 ## ## Alpha: 0.05 ; DF Error: 8 ## ## Critical Range ## 2 3 4 ## 8.928965 9.304825 9.514910 ## ## Means with the same letter are not significantly different. ## ## yield groups ## oo 36.90000 a ## ff 36.33333 a ## cc 24.40000 b ## fc 12.86667 c
## $statistics ## MSerror Df Mean CV ## 22.48917 8 27.625 17.1666 ## ## $parameters ## test name.t ntr alpha ## Duncan virus 4 0.05 ## ## $duncan ## Table CriticalRange ## 2 3.261182 8.928965 ## 3 3.398460 9.304825 ## 4 3.475191 9.514910 ## ## $means ## yield std r Min Max Q25 Q50 Q75 ## cc 24.40000 3.609709 3 21.7 28.5 22.35 23.0 25.75 ## fc 12.86667 2.159475 3 10.6 14.9 11.85 13.1 14.00 ## ff 36.33333 7.333030 3 28.0 41.8 33.60 39.2 40.50 ## oo 36.90000 4.300000 3 32.1 40.4 35.15 38.2 39.30 ## ## $comparison ## NULL ## ## $groups ## yield groups ## oo 36.90000 a ## ff 36.33333 a ## cc 24.40000 b ## fc 12.86667 c ## ## attr(,"class") ## [1] "group"
# Just the letter-group assignments.
out$groups
## yield groups ## oo 36.90000 a ## ff 36.33333 a ## cc 24.40000 b ## fc 12.86667 c
# Horizontal bar plot of group means with error bars.
bar.err(out$means, horiz=TRUE, xlim=c(0, 50))
https://www.rstudio.com/rviews/2017/02/22/how-to-teach-r-common-mistakes/